Focus on the non-integrated dataset
# Import useful modules
import numpy as np
import pandas as pd
import scanpy as sc
import os
#import igraph
import matplotlib.pyplot as plt
import seaborn
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=90)
# Load the integrated and annotated dataset
adata_ann = sc.read_h5ad('/Data/Annotated_dataset_v1.h5ad')
# Load non-integrated & pre-processed dataset
adata_raw = sc.read_h5ad('/Data/PreProcessed_preliminary_dataset.h5ad')
# Transfer annotation onto a non-data integrated anndata object.
adata = adata_raw[adata_ann.obs.index.tolist(), ]
adata.obs['cell_type'] = adata_ann.obs['cell_type']
secretory_cell_bool = []
for x in adata.obs['cell_type']:
secretory_cell_bool = secretory_cell_bool + [x in ['Secretory', 'Secretory N']]
list_of_cell_names = adata.obs.loc[secretory_cell_bool, :].index.tolist()
adata = adata[list_of_cell_names, ]
adata.shape
# Remove unwanted genes (mitochondrial genes)
genes_names = adata.var.index.tolist()
keep_genes = [i for i in genes_names if not i.startswith('MT-')]
adata = adata[:,keep_genes]
adata.shape
sc.pp.filter_genes(adata, min_cells=100)
adata.X.shape
sc.pp.highly_variable_genes(adata, min_mean=0.01, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata)
np.sum(adata.var['highly_variable'])
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca_variance_ratio(adata, log = False)
sc.pl.pca_loadings(adata, components=list(range(0,10)))
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=7)
sc.tl.umap(adata)
sc.pl.umap(adata, color=['manip', 'position'], edges = False)
sc.pl.umap(adata, color=['cell_type'], edges = False)
communities, graph, Q = sc.external.tl.phenograph(adata.obsm['X_pca'][:,0:7], k=100)
adata.obs['phenograph'] = pd.Categorical(communities)
sc.pl.umap(adata, color=['phenograph'])
adata.raw = adata
mean_cellType = np.empty((len(adata.obs['phenograph'].unique()), adata.raw.shape[1]),
dtype=float, order='C')
raw_adata = adata.raw.X
for i in range(0, len(adata.obs['phenograph'].unique())):
#print(adata.obs['phenograph'].unique()[i])
mean_cellType[i,:] = np.mean(raw_adata[adata.obs['phenograph'] == adata.obs['phenograph'].unique()[i], :], axis = 0)
mean_df = pd.DataFrame(np.corrcoef(mean_cellType), index = adata.obs['phenograph'].unique(), columns = adata.obs['phenograph'].unique())
ax = seaborn.clustermap(mean_df, cmap="jet").fig.suptitle('Phenograph')
sc.tl.rank_genes_groups(adata, groupby = 'phenograph', method='wilcoxon')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
sc.pl.umap(adata, color=['phenograph'])
new_cluster_names = {
'0':'Secretory', '1':'Secretory N', '2':'Secretory', '3':'Secretory', '4':'Secretory N', # 0-4
'5':'Secretory','6':'Secretory', '7':'Secretory', '8':'Secretory N', '9':'Secretory N', '10':'Secretory N', # 5-9
'11':'Secretory', '12':'Secretory', '13':'Secretory N', '14':'Serous',
'15':'Secretory N'}
vect = []
for i in range(0, len(adata.obs['phenograph'])):
vect = vect + [new_cluster_names[str(adata.obs['phenograph'][i])]]
adata.obs['cell_type_detail'] = vect
sc.pl.umap(adata, color=['cell_type_detail'])
## Associate final cell types
adata_ann.obs['cell_type_v2'] = adata_ann.obs.cell_type.astype('str')
adata_ann.obs.loc[adata.obs.index.tolist(), 'cell_type_v2'] = adata.obs['cell_type_detail']
sc.pl.umap(adata_ann, color=['cell_type_v2'])
# Add annotation
adata = adata_raw[adata_ann.obs.index.tolist(), ]
adata.obs['cell_type_v2'] = adata_ann.obs['cell_type_v2']
adata.shape
immune_cell_bool = []
for x in adata.obs['cell_type_v2']:
immune_cell_bool = immune_cell_bool + [x in ['Secretory']]
list_of_cell_names = adata.obs.loc[immune_cell_bool, :].index.tolist()
adata_sc = adata[list_of_cell_names, ]
adata_sc.shape
# Remove unwanted genes (mitochondrial genes)
genes_names = adata_sc.var.index.tolist()
keep_genes = [i for i in genes_names if not i.startswith('MT-')]
adata_sc = adata_sc[:,keep_genes]
sc.pp.filter_genes(adata_sc, min_cells=100)
adata_sc.X.shape
sc.pp.highly_variable_genes(adata_sc, min_mean=0.01, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata_sc)
np.sum(adata_sc.var['highly_variable'])
sc.pp.scale(adata_sc, max_value=10)
sc.tl.pca(adata_sc, svd_solver='arpack')
sc.pl.pca_variance_ratio(adata_sc, log = False)
sc.pl.pca_loadings(adata_sc, components=list(range(0,10)))
sc.pp.neighbors(adata_sc, n_neighbors=10, n_pcs=7)
sc.tl.umap(adata_sc)
sc.pl.umap(adata_sc, color=['manip', 'position'], edges = False)
sc.pl.umap(adata_sc, color=['MUC5AC', 'MUC5B', 'SCGB1A1'])
sc.pl.umap(adata_sc, color=['n_counts'])
communities, graph, Q = sc.external.tl.phenograph(adata_sc.obsm['X_pca'][:,0:7], k=100)
adata_sc.obs['phenograph'] = pd.Categorical(communities)
sc.pl.umap(adata_sc, color=['phenograph', 'manip'])
sc.tl.rank_genes_groups(adata_sc, groupby = 'phenograph', method='wilcoxon')
sc.pl.rank_genes_groups(adata_sc, n_genes=25, sharey=False)
sc.pl.umap(adata_sc, color=['phenograph'])
new_cluster_names = {
'0':'Secretory', '1':'Secretory', '2':'Secretory', '3':'Secretory', '4':'Secretory', # 0-4
'5':'Secretory','6':'Secretory', '7':'Secretory', '8':'Secretory', '9':'Secretory', '10':'Secretory', # 5-9
'11':'Secretory', '12':'Secretory', '13':'Secretory N'}
vect = []
for i in range(0, len(adata_sc.obs['phenograph'])):
vect = vect + [new_cluster_names[str(adata_sc.obs['phenograph'][i])]]
adata_sc.obs['cell_type_detail'] = vect
sc.pl.umap(adata_sc, color=['cell_type_detail'])
## Associate final cell types
adata_ann.obs['cell_type_v2'] = adata_ann.obs.cell_type_v2.astype('str')
adata_ann.obs.loc[adata_sc.obs.index.tolist(), 'cell_type_v2'] = adata_sc.obs['cell_type_detail']
sc.pl.umap(adata_ann, color=['cell_type_v2'])
# Add annotation
adata = adata_raw[adata_ann.obs.index.tolist(), ]
adata.obs['cell_type_v2'] = adata_ann.obs['cell_type_v2']
adata.shape
immune_cell_bool = []
for x in adata.obs['cell_type_v2']:
immune_cell_bool = immune_cell_bool + [x in ['Secretory']]
list_of_cell_names = adata.obs.loc[immune_cell_bool, :].index.tolist()
adata_sc = adata_raw[list_of_cell_names, ]
adata_sc.shape
adata_sc.raw = adata_sc
# Remove unwanted genes (mitochondrial genes)
genes_names = adata_sc.var.index.tolist()
keep_genes = [i for i in genes_names if not i.startswith('MT-')]
adata_sc = adata_sc[:,keep_genes]
sc.pp.filter_genes(adata_sc, min_cells=100)
adata_sc.X.shape
sc.pp.highly_variable_genes(adata_sc, min_mean=0.01, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata_sc)
np.sum(adata_sc.var['highly_variable'])
sc.pp.scale(adata_sc, max_value=10)
sc.tl.pca(adata_sc, svd_solver='arpack')
sc.pl.pca_variance_ratio(adata_sc, log = False)
sc.pl.pca_loadings(adata_sc, components=list(range(0,10)))
sc.pp.neighbors(adata_sc, n_neighbors=100, n_pcs=4)
sc.tl.umap(adata_sc)
sc.pl.umap(adata_sc, color=['manip', 'position'], edges = False)
sc.pl.umap(adata_sc, color=['MUC5AC', 'MUC5B', 'SCGB1A1'])
sc.pl.umap(adata_sc, color=['n_counts'])
communities, graph, Q = sc.external.tl.phenograph(adata_sc.obsm['X_pca'][:,0:4], k=100)
adata_sc.obs['phenograph'] = pd.Categorical(communities)
sc.pl.umap(adata_sc, color=['phenograph', 'manip'])
sc.tl.rank_genes_groups(adata_sc, groupby = 'phenograph', method='wilcoxon')
sc.pl.rank_genes_groups(adata_sc, n_genes=25, sharey=False)
sc.pl.umap(adata_sc, color=['phenograph'])
new_cluster_names = {
'0':'Secretory', '1':'Secretory', '2':'Club', '3':'Club', '4':'Club', # 0-4
'5':'Club','6':'Secretory', '7':'Club', '8':'Club', '9':'Goblet', '10':'Club', # 5-9
'11':'Goblet', '12':'Club'}
vect = []
for i in range(0, len(adata_sc.obs['phenograph'])):
vect = vect + [new_cluster_names[str(adata_sc.obs['phenograph'][i])]]
adata_sc.obs['cell_type_detail'] = vect
sc.pl.umap(adata_sc, color=['cell_type_detail'])
sc.tl.rank_genes_groups(adata_sc, groupby = 'cell_type_detail', method='wilcoxon')
sc.pl.rank_genes_groups(adata_sc, n_genes=25, sharey=False)
## Associate final cell types
adata_ann.obs['cell_type_v2'] = adata_ann.obs.cell_type_v2.astype('str')
adata_ann.obs.loc[adata_sc.obs.index.tolist(), 'cell_type_v2'] = adata_sc.obs['cell_type_detail']
sc.pl.umap(adata_ann, color=['cell_type_v2'])
# Add annotation
adata = adata_raw[adata_ann.obs.index.tolist(), ]
adata.obs['cell_type_v2'] = adata_ann.obs['cell_type_v2']
adata.shape
immune_cell_bool = []
for x in adata.obs['cell_type_v2']:
immune_cell_bool = immune_cell_bool + [x in ['Secretory N']]
list_of_cell_names = adata.obs.loc[immune_cell_bool, :].index.tolist()
adata_sc = adata_raw[list_of_cell_names, ]
adata_sc.shape
adata_sc.raw = adata_sc
# Remove unwanted genes (mitochondrial genes)
genes_names = adata_sc.var.index.tolist()
keep_genes = [i for i in genes_names if not i.startswith('MT-')]
adata_sc = adata_sc[:,keep_genes]
sc.pp.filter_genes(adata_sc, min_cells=100)
adata_sc.X.shape
sc.pp.highly_variable_genes(adata_sc, min_mean=0.01, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata_sc)
np.sum(adata_sc.var['highly_variable'])
sc.pp.scale(adata_sc, max_value=10)
sc.tl.pca(adata_sc, svd_solver='arpack')
sc.pl.pca_variance_ratio(adata_sc, log = False)
sc.pl.pca_loadings(adata_sc, components=list(range(0,10)))
sc.pp.neighbors(adata_sc, n_neighbors=100, n_pcs=3)
sc.tl.umap(adata_sc)
sc.pl.umap(adata_sc, color=['manip', 'position'], edges = False)
sc.pl.umap(adata_sc, color=['MUC5AC', 'MUC5B', 'SCGB1A1'])
sc.pl.umap(adata_sc, color=['n_counts'])
communities, graph, Q = sc.external.tl.phenograph(adata_sc.obsm['X_pca'][:,0:3], k=100)
adata_sc.obs['phenograph'] = pd.Categorical(communities)
sc.pl.umap(adata_sc, color=['phenograph', 'manip'])
sc.tl.rank_genes_groups(adata_sc, groupby = 'phenograph', method='wilcoxon')
sc.pl.rank_genes_groups(adata_sc, n_genes=25, sharey=False)
sc.pl.umap(adata_sc, color=['phenograph'])
new_cluster_names = {
'0':'Secretory', '1':'Secretory', '2':'Club', '3':'Club', '4':'Club', # 0-4
'5':'Club','6':'Secretory', '7':'Club', '8':'Club', '9':'Goblet', '10':'Club', # 5-9
'11':'Goblet', '12':'Club'}
vect = []
for i in range(0, len(adata_sc.obs['phenograph'])):
vect = vect + [new_cluster_names[str(adata_sc.obs['phenograph'][i])]]
adata_sc.obs['cell_type_detail'] = vect
sc.pl.umap(adata_sc, color=['cell_type_detail'])
sc.tl.rank_genes_groups(adata_sc, groupby = 'cell_type_detail', method='wilcoxon')
sc.pl.rank_genes_groups(adata_sc, n_genes=25, sharey=False)